Install/Import Required Packages

# Install any missing packages and attach everything this document uses.
# GEOquery is a Bioconductor package: it must be installed through
# BiocManager::install(), not install.packages() (which only searches CRAN).
suppressMessages({
  # CRAN packages. 'plotly' is used later in the document (ggplotly calls),
  # so it is installed here as well.
  cran_packages <- c("knitr", "ggplot2", "rpart", "rpart.plot", "randomForest",
                     "caret", "factoextra", "FactoMineR", "pROC", "ggdendro",
                     "mlbench", "BiocManager", "pheatmap", "plotly")
  for (pkg in cran_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }

  # Bioconductor packages must go through BiocManager.
  if (!requireNamespace("GEOquery", quietly = TRUE)) {
    BiocManager::install("GEOquery", update = FALSE, ask = FALSE)
  }

  library(knitr)
  library(ggplot2)
  library(rpart)
  library(rpart.plot)
  library(randomForest)
  library(caret)
  library(factoextra)
  library(FactoMineR)
  library(pROC)
  library(ggdendro)
  library(mlbench)
  library(BiocManager)
  library(GEOquery)
  library(pheatmap)

})

Supervised Learning

Regression

We will go through two regression techniques in this session:

  1. Linear Regression
  2. Random Forest Regression

Data Set: mtcars

**As a simple data set, we will use the “mtcars” data set in R. The following code can be used to display the data set.**

# Load mtcars and render the first 10 rows as a formatted table.
library(knitr)
data("mtcars")
kable(head(mtcars, 10),caption = "Motor Trend Car Road Tests")
Motor Trend Car Road Tests
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

**Let’s Display the data set for Miles per Gallon (response) and Weight (predictor).**

library(ggplot2)
# plotly provides ggplotly() for interactive versions of ggplot figures.
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter plot of fuel efficiency (mpg) against vehicle weight (wt).
mtcars_plot = ggplot(mtcars, aes(wt, mpg)) +
geom_point() + xlab('Weight (1000 lbs)') + ylab('Miles Per Gallon')

# Render as an interactive plotly widget.
ggplotly(mtcars_plot)

Linear Regression

Simple Linear Regression Model

The R function lm is used to fit a linear regression model, with Miles per Gallon as the response and Weight as the predictor.

# Simple linear regression: mpg modeled as a linear function of wt.
simple_linear_regression = lm(formula = mpg ~ wt ,data = mtcars)
summary(simple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10
library(ggplot2)
library(plotly)

# Scatter plot with the fitted least-squares line (red) and dashed segments
# from each observation down to its fitted value (the residuals).
# Reuse the model already fitted above instead of refitting lm() inside aes(),
# and use fitted() rather than the partially matched `$fitted` component.
linear_reg_gg = ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  geom_segment(aes(xend = wt, yend = fitted(simple_linear_regression)),
               linetype = "dashed") +
  xlim(0, 5.5) +
  xlab('Weight (1000 lbs)') + ylab('Miles Per Gallon')

ggplotly(linear_reg_gg)
## `geom_smooth()` using formula = 'y ~ x'
Linear Regression Assumptions

The following plots are used to check whether the assumptions of linear regression are satisfied.

  • Linearity of Data (Residuals vs Fitted Plot)
  • Normality of Residuals (Normal Q-Q Plot)
  • Constant Variance of Residuals (Scale-Location Plot)
  • No Outliers (Cook’s Distance Plot)
# Arrange the four diagnostic plots in a 2x2 grid.
layout(matrix(1:4, byrow = T, ncol = 2))
# which = 1:4 -> Residuals vs Fitted, Normal Q-Q, Scale-Location, Cook's distance.
plot(simple_linear_regression, which = 1:4)

Multiple Linear Regression Model

Let’s add more input variables (or predictor variables) to the linear regression model.

\[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + \beta_3 ~Rear\_axle\_Ratio + Error \]

# Multiple linear regression: mpg on weight, cylinders, and rear axle ratio.
multiple_linear_regression = lm(formula = mpg ~ wt + cyl + drat ,data = mtcars)
summary(multiple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt + cyl + drat, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2944 -1.5576 -0.4667  1.5678  6.1014 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.7677     6.8729   5.786 3.26e-06 ***
## wt           -3.1947     0.8293  -3.852 0.000624 ***
## cyl          -1.5096     0.4464  -3.382 0.002142 ** 
## drat         -0.0162     1.3231  -0.012 0.990317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.613 on 28 degrees of freedom
## Multiple R-squared:  0.8302, Adjusted R-squared:  0.812 
## F-statistic: 45.64 on 3 and 28 DF,  p-value: 6.569e-11

Since the rear axle ratio is not significant (p-value > 0.05), we can remove that variable and run the model again. \[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + Error \]

# Reduced model after dropping the non-significant drat predictor.
multiple_linear_regression = lm(formula = mpg ~ wt + cyl ,data = mtcars)
summary(multiple_linear_regression)
## 
## Call:
## lm(formula = mpg ~ wt + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2893 -1.5512 -0.4684  1.5743  6.1004 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.6863     1.7150  23.141  < 2e-16 ***
## wt           -3.1910     0.7569  -4.216 0.000222 ***
## cyl          -1.5078     0.4147  -3.636 0.001064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.568 on 29 degrees of freedom
## Multiple R-squared:  0.8302, Adjusted R-squared:  0.8185 
## F-statistic: 70.91 on 2 and 29 DF,  p-value: 6.809e-12
Model Selection with Multiple Linear Regression Model

Full Model

# Full model: regress mpg on every other column in mtcars ('.' = all predictors).
full_model_linear_regression = lm(formula = mpg ~ . ,data = mtcars)
summary(full_model_linear_regression)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Backward Elimination

This method starts with the full model and eliminates unimportant predictor variables from the model.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
# Backward elimination by AIC, starting from the full model; trace = FALSE
# suppresses the step-by-step log.
backward_model = stepAIC(full_model_linear_regression, direction = "backward", trace = FALSE)
summary(backward_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Stepwise Method

This method starts with one variable and adds or removes predictor variables systematically.

library(MASS)
# Stepwise selection by AIC: variables may be both added and dropped.
stepwise_model = stepAIC(full_model_linear_regression, direction = "both", trace = FALSE)
summary(stepwise_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Random Forest Regression

Regression Trees

Partition the data set into decision boundaries.

library(rpart)
library(rpart.plot)
# Tree controls: tiny cp effectively disables complexity pruning,
# minbucket = 1 allows single-observation leaves, depth is capped at 2.
ctrl = list(cp = 0.00001, minbucket = 1, maxdepth = 2)
# Regression tree (method = 'anova') predicting mpg from weight and cylinders.
tree = rpart(mpg ~ wt+cyl, data = mtcars,control = ctrl, method = 'anova')
rpart.plot(tree)

library(ggplot2)
# Show the tree's partitions as decision boundaries in (cyl, wt) space.
# NOTE(review): the dashed lines at wt = 2.3 and cyl = 7 look hand-picked to
# approximate the fitted splits — verify them against rpart.plot(tree) above.
ggplot(mtcars, aes(y=wt, x=cyl, color=mpg)) +
geom_point() + 
geom_hline(yintercept=2.3, linetype="dashed", 
                color = "red", linewidth=1) + 
  geom_vline(xintercept=7, linetype="dashed", 
                color = "purple", linewidth=1)

Random Forest Regression

R code for Random Forest Regression

library(randomForest)
# Random forest regression of mpg on wt, cyl, drat with 500 trees;
# importance = TRUE stores variable-importance measures for plotting below.
Random_Forest_model = randomForest(formula = mpg ~ wt+cyl+drat, data = mtcars, ntree=500, importance=TRUE)
  • Get important features/variables from the model
# Extract variable importance (%IncMSE and IncNodePurity) as a data frame.
important_features = as.data.frame(importance(Random_Forest_model)) 

# Importance() returns feature names as row names; copy them into a column.
important_features$feature_name = row.names(important_features)

# Lollipop chart: segment length = %IncMSE, point size = IncNodePurity.
ggplot(important_features, aes(x=feature_name, y=`%IncMSE`)) + 
  geom_segment( aes(x=feature_name, xend=feature_name, y=0, yend=`%IncMSE`), color="orange") +
  geom_point(aes(size = IncNodePurity), color="darkorange", alpha=0.9) + 
  theme_minimal() + 
  coord_flip() + 
  theme( legend.position="None") + 
  xlab('Features') + ylab('Importance')

Classification

We will go through two classification models in this section.

  1. Logistic Regression
  2. Random Forest Classification

Data Set: Heart Disease Data Set

In this section, heart disease (heart_df) data set is used to apply classification models.

# Toy heart-disease data: 14 patients with cholesterol, age, and a 0/1 outcome.
cholesterol   <- c(180, 200, 195, 245, 205, 130, 155, 260, 175, 286, 203, 192, 256, 294)
age           <- c(30, 44, 32, 23, 57, 62, 34, 37, 30, 73, 40, 62, 37, 66)
heart_disease <- c(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1)
heart_df      <- data.frame(cholesterol, age, heart_disease)

heart_df
##    cholesterol age heart_disease
## 1          180  30             0
## 2          200  44             1
## 3          195  32             1
## 4          245  23             1
## 5          205  57             1
## 6          130  62             0
## 7          155  34             0
## 8          260  37             1
## 9          175  30             0
## 10         286  73             1
## 11         203  40             0
## 12         192  62             1
## 13         256  37             1
## 14         294  66             1
# Scatter of age vs cholesterol, colored by heart-disease status.
heart_plot = ggplot(data = heart_df, mapping = aes(x = cholesterol, y = age, col = as.factor(heart_disease) )) +
    geom_point()+
    guides(color = guide_legend(title = "Heart Disease"))+
    xlab('Cholesterol') + ylab('Age in Years')   # fixed axis-label typo ('Cholestoral')
ggplotly(heart_plot)

Logistic Regression

Display Data:

library(ggplot2)

# Plot the binary outcome against cholesterol with the fitted logistic curve
# (geom_smooth with a binomial glm).
heart_plot = ggplot(data = heart_df, mapping = aes(x = cholesterol, y = heart_disease)) +
    geom_point()+
    geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
    xlab('Cholesterol') + ylab('Heart Disease')

ggplotly(heart_plot)
## `geom_smooth()` using formula = 'y ~ x'

\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol)}{1+exp(\beta_0+\beta_1~Cholesterol)}\]

# Logistic regression of heart disease on cholesterol (binomial family, logit link).
logistic_regression = glm(formula = heart_disease ~ cholesterol  ,data = heart_df,family = 'binomial')
summary(logistic_regression)
## 
## Call:
## glm(formula = heart_disease ~ cholesterol, family = "binomial", 
##     data = heart_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.24143   17.94588  -1.407    0.160
## cholesterol   0.13247    0.09289   1.426    0.154
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18.2492  on 13  degrees of freedom
## Residual deviance:  7.2165  on 12  degrees of freedom
## AIC: 11.216
## 
## Number of Fisher Scoring iterations: 8

\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol + \beta_2 Age\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}{1+exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}\]

# Logistic regression with both cholesterol and age as predictors.
logistic_regression = glm(formula = heart_disease ~ cholesterol + age ,
                          data = heart_df,family = 'binomial')

summary(logistic_regression)
## 
## Call:
## glm(formula = heart_disease ~ cholesterol + age, family = "binomial", 
##     data = heart_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.31146   14.88515  -1.633    0.102
## cholesterol   0.10649    0.07342   1.450    0.147
## age           0.09857    0.10119   0.974    0.330
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18.2492  on 13  degrees of freedom
## Residual deviance:  6.0047  on 11  degrees of freedom
## AIC: 12.005
## 
## Number of Fisher Scoring iterations: 8

Get odds ratios from the parameter estimates and interpret them:

# Exponentiate the coefficients to convert log-odds into odds ratios.
exp(coef(logistic_regression))
##  (Intercept)  cholesterol          age 
## 2.764816e-11 1.112364e+00 1.103590e+00
  • The odds of getting a heart disease will increase by 11% (i.e., exp(0.10)=1.11) when the Cholesterol level increases by 1 unit.

  • The odds of getting heart disease will increase by 10% (i.e., exp(0.09)=1.10) when the age increases by 1 year.

Predict for new set of data with Logistic Regression:

  • New Data 1: A person with Age = 25 and cholesterol level 150.
  • New Data 2: A person with Age = 65 and cholesterol level 250.
# Predicted probabilities (type = "response") for two new patients.
predict(logistic_regression, 
        newdata = data.frame(cholesterol = c(150, 250), age = c(25,65)), 
        type = "response")
##           1           2 
## 0.002803361 0.999836308
  • The probability of getting a heart disease for a person aged 25 with a cholesterol level of 150 is about 0.3%.
  • The probability of getting a heart disease for a person aged 65 with a cholesterol level of 250 is about 99.98%.

Random Forest Classification

library(randomForest)

# Random forest classifier: the outcome is wrapped in as.factor() so
# randomForest runs in classification (not regression) mode.
Random_Forest_Classifier = randomForest(as.factor(heart_disease) ~ cholesterol + age, data=heart_df, ntree=500, importance = TRUE)
important_features = as.data.frame(importance(Random_Forest_Classifier)) 

important_features$feature_name <- row.names(important_features)

# Lollipop chart: segment = MeanDecreaseGini, point size = MeanDecreaseAccuracy.
ggplot(important_features, aes(x=feature_name, y=`MeanDecreaseGini`)) + 
  geom_segment( aes(x=feature_name, xend=feature_name, y=0, yend=`MeanDecreaseGini`), color="orange") +
  geom_point(aes(size = MeanDecreaseAccuracy), color="darkorange", alpha=0.9) + 
  theme_minimal() + 
  coord_flip() + 
  theme(legend.position = "none") +
  xlab('Features') + ylab('Importance')

Predict New Data with Random Forest:

# Predict class labels (0/1) for the same two new patients.
predict( Random_Forest_Classifier, 
         data.frame(cholesterol = c(150, 250), age = c(25,65)))
## 1 2 
## 0 1 
## Levels: 0 1
  • A person aged 25 with a cholesterol level of 150 is predicted not to get a heart disease.
  • A person aged 65 with a cholesterol level of 250 is predicted to get a heart disease.

Evaluation of Classification Models

Confusion matrix, Sensitivity and Specificity

library(caret)
# predict() with no newdata returns out-of-bag predictions for the training rows.
random_forest_prediction = predict(Random_Forest_Classifier)
observed_data = as.factor(heart_df$heart_disease)
# positive = '1' so sensitivity/specificity treat disease as the positive class.
confusionMatrix( random_forest_prediction, observed_data,positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 3 2
##          1 2 7
##                                          
##                Accuracy : 0.7143         
##                  95% CI : (0.419, 0.9161)
##     No Information Rate : 0.6429         
##     P-Value [Acc > NIR] : 0.4007         
##                                          
##                   Kappa : 0.3778         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.7778         
##             Specificity : 0.6000         
##          Pos Pred Value : 0.7778         
##          Neg Pred Value : 0.6000         
##              Prevalence : 0.6429         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.6429         
##       Balanced Accuracy : 0.6889         
##                                          
##        'Positive' Class : 1              
## 

ROC Curve

library(pROC)
# ROC curve for the logistic model, scored by its linear predictor.
roc_logistic <- roc(heart_df$heart_disease, as.numeric(predict(logistic_regression)),smoothed = TRUE,plot=TRUE, grid=TRUE,print.auc=TRUE)

# ROC for the random forest. NOTE(review): this uses hard class labels (0/1),
# which yields a single-threshold ROC — class probabilities
# (predict(..., type = "prob")) would give a more informative curve.
roc_random_forest <- roc(heart_df$heart_disease, as.numeric(random_forest_prediction),smoothed = TRUE,plot=TRUE, grid=TRUE,print.auc=TRUE)

#plot(roc_logistic, main = "ROC Curve")
#lines(roc_random_forest,col='blue')

Over-fitting & Cross-Validation

Train-test Split

Let’s use the mtcars data set to apply a train-test split: the training set gets 70% of the observations and the test set gets 30%.

# Reproducible ~70/30 train-test split of mtcars.
set.seed(1234)
# Renamed from 'sample' so the flag vector does not mask base::sample().
in_train <- sample(c(TRUE, FALSE), nrow(mtcars), replace=TRUE, prob=c(0.7,0.3))
mtcars_train  <- mtcars[in_train, ]
mtcars_test   <- mtcars[!in_train, ]

Train Data Set Size

# Rows x columns of the training set.
dim(mtcars_train)
## [1] 26 11

Test Data Set Size

# Rows x columns of the test set.
dim(mtcars_test)
## [1]  6 11

K-Fold Cross Validation

Apply k-fold cross validation for mtcars training data set.

Cross Validation with Linear Regression

library(caret)

# 4-fold cross-validation scheme shared by the models in this section.
cross_validate_setup <- trainControl(method = "cv", number = 4) # Cross validation with 4 folds

# Cross-validated linear regression of mpg on all predictors in the training set.
linear_regression_cv = train(mpg ~ .,
                     data = mtcars_train,                        
                     trControl = cross_validate_setup,           
                     method = "lm",
                     metric = 'RMSE'       # RMSE is used to compare the model
                     )

linear_regression_cv
## Linear Regression 
## 
## 26 samples
## 10 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 19, 19, 20, 20 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   3.765965  0.7585956  2.974284
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Cross Validation with Random Forest Regression

library(caret)

# 4-fold cross-validation (same scheme as the linear model above).
cross_validate_setup <- trainControl(method = "cv", number = 4) # Cross validation with 4 folds

# Cross-validated random forest with a grid search over mtry = 1..10
# (number of predictors sampled at each split).
random_forest_regression_cv = train(mpg ~ .,
                     data = mtcars_train,                        
                     trControl = cross_validate_setup,           
                     method = "rf",                      # Random Forest
                     metric = 'RMSE',
                     tuneGrid = expand.grid(.mtry = c(1 : 10)), #Grid Search with mtry from 1 to 10
                     na.action = na.pass)

random_forest_regression_cv
## Random Forest 
## 
## 26 samples
## 10 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 18, 21, 20, 19 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    1    2.685053  0.8574958  2.111840
##    2    2.426804  0.8694192  1.875677
##    3    2.376424  0.8723288  1.751273
##    4    2.366303  0.8671525  1.705216
##    5    2.340886  0.8646725  1.654538
##    6    2.316080  0.8670054  1.647052
##    7    2.386482  0.8635178  1.702981
##    8    2.315735  0.8655307  1.658174
##    9    2.344355  0.8587729  1.674820
##   10    2.283882  0.8637430  1.647277
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.

Random Forest Regression with mtry = 10 is the best model compared to the Linear Regression as the RMSE is the lowest.

Unsupervised Learning

K-Means Clustering

Let’s apply k-means clustering to the mtcars data set.

# Seed fixed so the random cluster initialization is reproducible.
set.seed(123)

# K-means with k = 2 on the raw (unscaled) mtcars columns.
# NOTE(review): columns are on very different scales, so large-valued columns
# (e.g. disp, hp) dominate the Euclidean distance — consider scale(mtcars).
k_means_clustering = kmeans(mtcars, centers = 2)

print(k_means_clustering$cluster)
##           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
##                   1                   1                   1                   1 
##   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
##                   2                   1                   2                   1 
##            Merc 230            Merc 280           Merc 280C          Merc 450SE 
##                   1                   1                   1                   2 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
##                   2                   2                   2                   2 
##   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
##                   2                   1                   1                   1 
##       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
##                   1                   2                   2                   2 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
##                   2                   1                   1                   1 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##                   2                   1                   2                   1

Plot K-Means Clustering using factoextra library

library(factoextra)
# Cluster plot projected onto the first two principal components.
fviz_cluster(k_means_clustering, data = mtcars)

The following two methods can be used to check best number of clusters.

# Average silhouette width across candidate k; the peak suggests the best k.
fviz_nbclust(mtcars, kmeans, method = "silhouette")

# Within-cluster sum of squares ("elbow" method) across candidate k.
fviz_nbclust(mtcars, kmeans, method = "wss")

Hierarchical Clustering

Let’s apply hierarchical clustering to the mtcars data set.

library(ggdendro)
set.seed(123)
# Pairwise Euclidean distances between cars (rows of mtcars).
mtcars_distance = dist(mtcars, method = 'euclidean')
# Agglomerative hierarchical clustering (hclust default linkage).
h_clustering = hclust(mtcars_distance)
ggdendrogram(h_clustering)

Heatmaps based on hierarchical clustering are popular for analyzing gene expression data.

Data Source: https://bioconductor.org/packages/devel/data/experiment/html/celldex.html

library(BiocManager)
library(GEOquery)
library(pheatmap)

# Download the GSE33126 expression series from GEO (requires network access).
gse <- getGEO("GSE33126")

# Expression matrix dimensions: probes x samples.
dim((exprs(gse[[1]])))
# Transpose so samples are rows and probes are columns.
gene_exp = t(exprs(gse[[1]]))
# Keep only high-signal probes (column sums above an arbitrary cutoff).
data_subset <- as.matrix(gene_exp[,colSums(gene_exp)>500000])
# Renamed from 'dist' so the object does not mask stats::dist().
probe_dist = dist(t(data_subset) , diag=TRUE)
hc   = hclust(probe_dist)
dhc  = as.dendrogram(hc)
#ggdendrogram(hc)
# Heatmap of scaled expression; pheatmap clusters rows and columns itself.
pheatmap(scale(data_subset))

PCA Biplot

Principal components reduce the data dimensions and allow us to visualize the correlation structure of the data effectively.

library(FactoMineR)

# PCA of mtcars; graph = FALSE suppresses FactoMineR's default plots.
pca_mtcars <- PCA(mtcars,  graph = FALSE)

# Biplot of observations and variable loadings; repel avoids label overlap.
fviz_pca_biplot(pca_mtcars, repel = TRUE)